import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from plotnine import *
%matplotlib inline
import matplotlib.pyplot as plt
from unicodedata import normalize
import warnings
warnings.filterwarnings("ignore")
df1 = pd.read_csv("/Users/FrancesY/Documents/Columbia/spring2022/5063viz/hw11/assignment-1---winter-olympics-franyang6/data/athletes_and_events.csv")
df2 = pd.read_csv("/Users/FrancesY/Documents/Columbia/spring2022/5063viz/hw11/assignment-1---winter-olympics-franyang6/data/gdp_pop.csv")
df3 = pd.read_csv("/Users/FrancesY/Documents/Columbia/spring2022/5063viz/hw11/assignment-1---winter-olympics-franyang6/data/noc_regions.csv")
df1.head()
df2 = df2.rename(columns={"Code": "NOC"})
df2.head()
df3.head()
df4 = pd.merge(df1, df2, on="NOC")
df5 = pd.merge(df4,df3, on="NOC")
df5 = df5[~df5['Medal'].isnull()]
df5 = df5[df5["Season"] == "Winter"]
df5 = df5.drop_duplicates(
subset = ['Country','Year', 'Event','Medal'],
keep = 'last').reset_index(drop = True)
df5.head()
NOC_rank = df5.groupby("NOC").size().sort_values(ascending=False)
NOC_top10 = NOC_rank[:10]
NOC_top10
# Filtering countries to top10
df6 = df5[df5["NOC"].isin(list(NOC_top10.index))]
n_of_games = df6.groupby("Country").Year.nunique()
n_of_games = n_of_games.reset_index()
n_of_games = n_of_games.rename(columns = {"Year":"Count"})
n_of_games = n_of_games.sort_values("Count",ascending = False)
n_of_games
(ggplot(n_of_games,
aes(x = "reorder(Country, Count)", y = "Count")) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
geom_text(aes(label="Count"), color="black", size=8.5)+
labs(x = "Country", title = "How Many Winter Games Each Country Competed In") +
coord_flip() + theme_bw())
howmany = df6.groupby(["Country","Medal"]).size()
howmany = howmany.reset_index()
howmany = howmany.rename(columns = {0:"Count"})
howmany
cat_order = ['Gold','Silver','Bronze']
howmany['Medal'] = pd.Categorical(howmany['Medal'], categories=cat_order, ordered=True)
(ggplot(howmany, aes(x = "reorder(Country, Count)", y = "Count", fill = "Medal")) +
geom_bar(stat="identity", alpha=.6, width=.4) +
scale_fill_manual(values = ["#FFD700", "#C0C0C0", "#CD7F32"]) +
labs(x = "Country", title = "How Many Medals of Each Type the Country Won") +
coord_flip() + theme_bw())
df6['Country'] = df6['Country'].str.replace('United States', 'United_States')
df7 = df6[df6["Country"].isin(["United_States","Canada"])]
df7['CountrySex'] = df7['Country'] + df7['Sex']
df8_1 = df7.groupby(["CountrySex","Year"]).size().reset_index()
df8_1 = df8_1.rename(columns = {0:"Count"})
df8_1['Cumulative'] = df8_1.groupby("CountrySex")['Count'].cumsum()
df8_1
(ggplot(df8_1, aes(x = "Year", y = "Cumulative", fill = "CountrySex")) +
labs(y = "Cumulative Medal Count", title = "Overtime Comparison 1") +
scale_fill_manual(values = ["#ff0000", "#1261A0", "#801B1B","#00008b"]) +
geom_bar(stat = "identity", width= 3, position = "dodge"))
df9 = df6.groupby(["Country","Sex","Medal"]).size().reset_index()
df9 = df9.rename(columns = {0:"Count"})
df10 = df9[df9["Country"].isin(["United_States","Canada"])]
df10.reset_index(inplace = True)
df10.drop(columns = "index",inplace=True)
df9.head()
df10 = df7.groupby(["Country","Year","Sex"]).size().reset_index()
df10 = df10.rename(columns = {0:"Count"})
df10['Cumulative'] = df10.groupby(["Country","Sex"])['Count'].cumsum()
df10.head()
(ggplot(df10, aes(x = "Year", y = "Cumulative", fill = "Sex")) +
geom_col()+
facet_wrap("Country", scales = "free_x") +
coord_flip() +
labs(y = "Cumulative Medal Count", title = "Overtime Compairson 2") +
theme_bw())
df13 = df5.groupby("Country").size().sort_values(ascending=False).reset_index()
df13 = df13.rename(columns = {0:"Count"})
df14 = pd.merge(df13, df2, on="Country")
df14["Count_pop"] = df14["Count"]/(df14["Population"])
df14["Count_GDP"] = df14["Count"]/(df14["GDP per Capita"])
df14.head()
df14_1 = df14.sort_values("Count", ascending= False)[:20]
(ggplot(df14_1, aes(x = "reorder(Country, Count)", y = "Count")) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
geom_text(aes(label="Count"), color="black", size=8.5)+
labs(x = "Country", title = "Medal Counts Unadjusted Top 20 Countries") +
geom_smooth() +
coord_flip() + theme_bw())
df14_2 = df14.sort_values("Count_GDP", ascending= False)[:20]
(ggplot(df14_2, aes(x = "reorder(Country, Count_GDP)", y = "Count_GDP")) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
labs(x = "Country", title = "Medal Counts Adjusted by GDP per Capita Top 20 Countries") +
geom_smooth() +
coord_flip() + theme_bw())
df14_3 = df14.sort_values("Count_pop", ascending= False)[:20]
(ggplot(df14_3, aes(x = "reorder(Country, Count_pop)", y = "Count_pop")) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
labs(x = "Country", title = "Medal Counts Adjusted by Population Top 20 Countries") +
geom_smooth() +
coord_flip() + theme_bw())
# Ranking countries by Unadjusted Count(CountRank), Count adjusted by GDP per capita(GDPRank) and Count adjusted by population (PopRank)
df14['CountRank'] = df14['Count'].rank(method='dense', ascending=False)
df14['GDPRank'] = df14['Count_GDP'].rank(method='dense', ascending=False)
df14['PopRank'] = df14['Count_pop'].rank(method='dense', ascending=False)
df14.head()
df14_4 = df14[:20]
# Ordering dataframe by Unadjusted Medal Count
ordered_df = df14_4.sort_values(by='Count')
my_range=range(1,len(df14_4.index)+1)
plt.hlines(y=my_range, xmin=0,xmax=40, color='grey', alpha=0.1)
plt.hlines(y=(8,12), xmin=0,xmax=40, color='red', alpha=0.3)
plt.scatter(ordered_df['CountRank'], my_range, color='navy', alpha=1, label='Medal Rank Unadjusted')
plt.scatter(ordered_df['GDPRank'], my_range, color='gold', alpha=0.5 , label='Medal Rank Adjusted by GDP per Capita')
plt.scatter(ordered_df['PopRank'], my_range, color='red', alpha=0.5, label='Medal Rank Adjusted by Population')
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.yticks(my_range, ordered_df['Country'])
plt.title("Rank Unadjusted, Adjusted by GDP Per Capita, and Adjusted by Population", loc='left')
plt.xlabel('Rank')
df15 = pd.read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")[1]
host_list = df15[['City.1','Country','Year']]
host_list = host_list.rename(columns = {"City.1":"City"})
df16 = pd.merge(df5, host_list, on=["City","Year"])
df16.head()
df17 = df16[df16["Country_x"] == df16["Country_y"]]
hosted = df17.groupby(["Country_x","Year"]).size()
hosted = hosted.reset_index()
hosted = hosted.rename(columns = {0:"Count"})
hosted.head()
hosted1 = df5[df5['Country'].isin(hosted["Country_x"])]
hosted2 = hosted1.groupby(["Country","Year"]).size().reset_index()
hosted2 = hosted2.rename(columns = {0:"Count"})
hosted2.head()
(ggplot(hosted2, aes(x = "Year", y = "Count", color = "Country")) +
geom_line() +
labs(y = "Medal Count", title = "Host Country Advantage") +
theme_bw()) + geom_point(data=hosted, color = "black", size = 2)
successful = df5.groupby(["Name"]).size().sort_values(ascending=False).reset_index()
successful = successful.rename(columns = {0:"Count"})
successful = successful[:10]
(ggplot(successful,
aes(x = "reorder(Name, Count)", y = "Count")) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
geom_text(aes(label="Count"), color="black", size=8.5)+
labs(x = "Name", y = "Total Medal Count", title = "The Most Successful Winte Olympics Athletes of All Time") +
coord_flip() + theme_bw())
info = pd.merge(successful, df5, on="Name")
info = info.drop_duplicates(subset=['Name']).reset_index().drop(columns = "index")
info['BMI'] = info['Weight'] / ((info['Height']/100) ** 2)
info['BMI'] = round(info['BMI'], 2)
(ggplot(info,
aes(x = "reorder(Name, Count)", y = "BMI")) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
geom_text(aes(label="BMI"), color="black", size=8.5)+
labs(title = "Body Mass Index of The Most Successful Winte Olympics Athletes of All Time") +
coord_flip() + theme_bw())
import plotly.express as px
import plotly.graph_objs as go
df14.head()
fig = px.scatter(df14, x = "GDP per Capita", y = "Count", size = "Population",
color = "Country",hover_name = "Country",log_x = True, size_max = 100).update_layout(
yaxis_title="Total Medal Count"
)
fig.show()
athletes = df5.groupby("Name").size().reset_index()
athletes = athletes.rename(columns = {0:"Count"})
athletes = athletes.sort_values("Count", ascending = False)[:500]
info2 = pd.merge(athletes, df5, on="Name")
info2 = info2.drop_duplicates(subset=['Name']).reset_index().drop(columns = "index")
info2['BMI'] = info2['Weight'] / ((info2['Height']/100) ** 2)
info2['BMI'] = round(info2['BMI'], 2)
fig2 = px.scatter(info2, x = "Age", y = "BMI",
color = "Sport", hover_name = "Name",log_x = True).update_layout(
yaxis_title="BMI", title = "BMI of Top 200 Olympic Winter Athletes"
)
fig2.show()
df5.to_csv("df5.csv")
%reload_ext rpy2.ipython
%%R
library(DT)
rdf <- read.csv(file = 'df5.csv')
head(rdf)
datatable(rdf, filter = 'top', options = list(
pageLength = 5, autoWidth = TRUE))